Tempo and Beat Detection¶

Testing librosa tempo detection

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import IPython.display as ipd
import librosa
import librosa.display
import math
import glob
import os
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

Let's load a snippet of a song.

In [2]:
TEST_CASE_INDEX = 1

# Each case: a glob pattern for the source file, the song's known tempo (BPM),
# and the offset/length (seconds) of the snippet to analyze.
test_cases = [
    # these are all pretty good
    {'src': '**/*Ed*', 'known_tempo': 126, 'start': 60.34, 'len': 2.5 },
    {'src': '**/*Ed*', 'known_tempo': 126, 'start': 60.34, 'len': 5 },
    {'src': '**/*Ed*', 'known_tempo': 126, 'start': 60.34, 'len': 10 },
    {'src': '**/*Ed*', 'known_tempo': 126, 'start': 60.34, 'len': 20 },

    # this song has more complicated rhythm, first case is 1.33x tempo, and sounds triplet feel.
    # second is closer to tempo, but still off sounding. 3rd, long 20s window, sounds great
    {'src': '**/*Dua*Rules*', 'known_tempo': 116, 'start': 60.80, 'len': 5 },
    {'src': '**/*Dua*Rules*', 'known_tempo': 116, 'start': 60.80, 'len': 10 },
    {'src': '**/*Dua*Rules*', 'known_tempo': 116, 'start': 80.80, 'len': 20 },

    # simple metronomes
    {'src': '**/*126 BPM*', 'known_tempo': 126, 'start': 22.8, 'len': 5 },
    {'src': '**/*120 BPM*', 'known_tempo': 120, 'start': 10,   'len': 5 },
]

test_case = test_cases[TEST_CASE_INDEX]
test_case['end'] = test_case['start'] + test_case['len']

# recursive=True makes '**' match any directory depth; without it, glob treats
# '**' like a plain '*' and only searches a single directory level.
matches = glob.glob(test_case['src'], recursive=True)
assert matches, f"no audio file matches pattern {test_case['src']!r}"
src = matches[0]

# Load a mono snippet at 48 kHz from the requested offset/duration.
y, sr = librosa.load(src, sr=48000, offset=test_case['start'], duration=test_case['len'])

ipd.display(pd.DataFrame([[sr, len(y), len(y.shape)]],
                         columns=["Sample rate Hz", "Num Samples", "Channels"]).style.hide())
ipd.Audio(y, rate=sr)
Sample rate Hz Num Samples Channels
48000 240000 1
Out[2]:
Your browser does not support the audio element.

Now use the built-in beat track method, then generate a click track at the detected beats and overlay that click audio onto the detected segment.

Librosa beat_track returns two outputs, the assumed detected tempo or BPM (beats per minute), as well as an array of detected beat events (in seconds time or sample time). As far as BPM specifically, we can compare the reported tempo to the tempo(s) inferred by the times (e.g. average) between these detected beat events.

Also note, depending on test audio input source, the number of detected beat events is often lower than expected (both if you listen for kick drum or clicks in the sample audio vs inferred number of beats from known tempo and snippet length).

In [3]:
HOP_LENGTH = 256

def predict_beats(y, sr):
    """Detect beats in mono audio ``y`` sampled at ``sr`` Hz.

    Returns the ``(tempo, beat_times)`` pair from ``librosa.beat.beat_track``,
    with ``beat_times`` in seconds (``units='time'``).
    """
    onset_env = librosa.onset.onset_strength_multi(
        y=y, sr=sr,
        hop_length=HOP_LENGTH,
        aggregate=np.median, # default is mean
        lag=1, # default, unit? "time lag for computing differences"
        max_size=1, # default, do not filter freq bins
        detrend=False, # default, do not "filter onset strength to remove DC component"
        center=True, # Centered frame analysis in STFT, by hop length
    )
    # onset_strength_multi returns one envelope per channel; keep channel 0.
    onset_env = onset_env[..., 0, :]

    return librosa.beat.beat_track(
        onset_envelope=onset_env, sr=sr, units='time',
        hop_length=HOP_LENGTH,
        tightness=1000, # yikers island, what does this do... good? 800 1000, bad 400 600 1600
        # start_bpm=126,
        # trim=False,
    )

reported_tempo, beats = predict_beats(y, sr)

# Sanity check: how many beats *should* fall inside the snippet at the known tempo?
expected_beats = math.floor(test_case['known_tempo'] * test_case['len'] / 60.0)
ipd.display(pd.DataFrame([
    ['Reported tempo', reported_tempo],
    ['Averaged tempo', 60 / np.average(np.diff(beats))],
    ['Num beats detected vs expected', f"{len(beats)} vs {expected_beats}"],
]).style.hide(axis="columns").hide())

# Overlay a click at every detected beat to audition the detection quality.
click_track = librosa.clicks(times=beats, sr=sr, length=len(y))
ipd.Audio(y + click_track, rate=sr)
Reported tempo 126.404494
Averaged tempo 126.720901
Num beats detected vs expected 10 vs 10
Out[3]:
Your browser does not support the audio element.

In the past, the reported tempo from Librosa was often not as good as the one indicated by the detected beats, not sure what may have changed regarding my inputs / knob fiddling, nor what the overall status is like now.

In [4]:
# Compare the reported tempo against tempos implied by the detected inter-beat
# intervals (computed once, instead of re-running np.diff per row).
intervals = np.diff(beats)  # seconds between consecutive detected beats

pd.DataFrame([
    ['Reported', reported_tempo],
    ['Averaged', 60 / np.average(intervals)],
    ['Min', 60 / np.max(intervals)],   # slowest implied tempo = widest gap
    ['Max', 60 / np.min(intervals)],   # fastest implied tempo = narrowest gap
    ['Median', 60 / np.median(intervals)],
    ['-','-'],
    ['Known', test_case['known_tempo']],
    ['Known seconds per beat', 60 / test_case['known_tempo'] ],
    ['Averaged seconds per beat', np.average(intervals)],
], columns=["Method", "BPM"]).style.hide()
Out[4]:
Method BPM
Reported 126.404494
Averaged 126.720901
Min 125.000000
Max 130.813953
Median 126.404494
- -
Known 126
Known seconds per beat 0.476190
Averaged seconds per beat 0.473481

Now we get into using the detected past time window's beats and applying it to recent future audio.

Let's use that prediction and overlay the would be assumed beats onto the next chunk of the track and see how it sounds. This first method is to duplicate and add the beat events shifted over.

In [11]:
# first load twice as much audio
duration = 2.0 * test_case['len']
future, _ = librosa.load(src, sr=sr, offset=test_case['start'], duration=duration)

future_beats = np.array(beats)
diffs = np.diff(beats)
beats_added = 0
while future_beats[-1] < duration:
    future_beats = np.append(future_beats, future_beats[-1] + diffs[beats_added % len(diffs)])
    beats_added = beats_added + 1

future_click = librosa.clicks(times=future_beats, sr=sr, length=len(future))
ipd.Audio(future + future_click, rate=sr)
22
Out[11]:
Your browser does not support the audio element.

Or choose just a single beat as anchor (e.g. first or last one) and use one of the reported/derived tempos as constant spacing to create an overlaid click track. Perhaps it's here that implementing decent fitting / selecting of both tempo and anchor could yield better results overall...

In [15]:
# Anchor on the first detected beat, then space clicks at the average detected
# inter-beat interval (constant tempo assumption).
spb = np.average(np.diff(beats))  # seconds per beat
constant_bpm_clicks = [beats[0]]
while constant_bpm_clicks[-1] < test_case['end']:
    # append in place instead of rebuilding the list via concatenation each pass
    constant_bpm_clicks.append(constant_bpm_clicks[-1] + spb)
ipd.Audio(future + librosa.clicks(times=constant_bpm_clicks, sr=sr, length=len(future)), rate=sr)
Out[15]:
Your browser does not support the audio element.
In [26]:
# tempo_used = 60 / np.average(np.diff(beats))
tempo_used = reported_tempo
spb = 60 / tempo_used  # seconds per beat at the chosen tempo

# Estimate where "beat 1" lands: back each detected beat off by k whole periods
# and average the estimates to get a single anchor time.
beat1_ests = [beat - k * spb for k, beat in enumerate(beats)]
anchor = np.mean(beat1_ests)
beat_ests = np.arange(anchor, duration, spb)  # NOTE(review): unused below — remove or use for the click times?

constant_bpm_clicks = [anchor]
while constant_bpm_clicks[-1] < test_case['end']:
    # append in place instead of rebuilding the list via concatenation each pass
    constant_bpm_clicks.append(constant_bpm_clicks[-1] + spb)
ipd.Audio(future + librosa.clicks(times=constant_bpm_clicks, sr=sr, length=len(future)), rate=sr)
Out[26]:
Your browser does not support the audio element.

These might be good enough?

With minimal testing (read: a few cases... 😬), either sounds OK?

Next Steps(?):¶

  • Improve the above Librosa calls (try other params?), test with multiple sound files (especially ones with multiple songs started, stopped, transitioned to, etc) and past time windows. (These now ancient notes indicate we can probably improve using the librosa output with even basic / brute-force overlay fitting of the detected beat tempo + sample times?).

  • Apply "smoothing". This is where statistics / math nerds might be able to quickly help? I imagine, if I get all the above working, the predictions will vary in accuracy, plus we must remember that anyone can stop a currently playing song and play another one of entirely different tempo. It's luckily not the end of the world if the beat sync is wildly off (I hope), especially for a short time, but it would be nice if we could strike a balance that weights the last N calcs / M minutes of tempos, while still handling abrupt changes (stops, starts, new songs, etc.).

  • More on smoothing: on the Dua Lipa track, it sounds like different window variants may detect the same beats but with "dotted" / 1.5x type spacing, in both BPM and beat times. So I wonder if that can be factored into (probably more and more non-trivial) smoothing / combining techniques. Another smoothing-related thought: could try multiple length windows and weight them somehow (e.g. 5s, 10s, 20s, 60s into the past)?

  • Add visual rendering of waveform, with past window's beat detections and future predictions marked.

  • Use real time audio stream. Obviously, this notebook uses pre-baked audio files for quick demo / testing. In this repo, I've started code that uses Python audio lib(s) to listen to a real time audio device stream (like what would be played live, for beat detection and sync), store the samples in a ring buffer, and use the same librosa code to run short-windowed-into-the-past beat detection algorithms. Needs revisit + clean up. IN PROGRESS

  • Take that prediction output, and make sure it is sample-time synchronized / accurate with the real time audio input samples and wall-clock time.

  • Fire OSC LX Studio compatible events synchronized to predictions.

  • Create LX Studio beat specific FX, namely to test + demo. (Can be done in parallel, before / while actual tempo improvements are being made). DONE

SCRATCH:¶

In [7]:
# Scratch fixture: nonzero values interleaved with runs of zeros of varying length.
a = [1.3888, 2, 3.37, 0, 0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 9, 8, 7, 0, 10, 11]
# one-liner alternative considered for find_zero_runs below:
# np.ediff1d(np.r_[0, a == 0, 0]).nonzero()[0].reshape(-1, 2)

def find_zero_runs(a):
    """Return an (n, 2) int array of [start, end) index pairs, one per run of zeros in ``a``."""
    # Boolean mask of zero positions, padded with a non-zero sentinel on each
    # side so runs touching either end of the array still produce both edges.
    mask = np.concatenate(([0], np.equal(a, 0), [0]))
    # A run boundary is wherever the padded mask flips 0<->1.
    edges = np.flatnonzero(np.abs(np.diff(mask)) == 1)
    # Edges come in (start, end) pairs; fold them into rows.
    return edges.reshape(-1, 2)
# Materialize the detected runs as plain lists for display, then keep only the
# runs longer than two samples (short gaps treated as noise).
zr = find_zero_runs(a)
zr = [list(pair) for pair in zr]
ipd.display(zr)

zr = [pair for pair in zr if pair[1] - pair[0] > 2]
zr
[[3, 9], [12, 16], [19, 20]]
Out[7]:
[[3, 9], [12, 16]]
In [8]:
# Scratch: locate sign transitions by differentiating np.sign of the samples.
# Nonzero entries mark a change: ±1 into/out of a zero run, ±2 a direct crossing.
# (Give the derived array its own name instead of reusing `a` for three
# different values, which made the cell non-idempotent on re-run.)
a = [1.3888, 2, 3.37, -1, -2, 0, 0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 9, 8, 7, 0, 10, 11]
sign_changes = np.diff(np.sign(a))
sign_changes
Out[8]:
array([ 0.,  0., -2.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
       -1.,  0.,  0.,  0.,  1.,  0.,  0., -1.,  1.,  0.])
In [9]:
# NOTE(review): scratch load of a local capture; raises FileNotFoundError when
# /tmp/sigh.wav is absent (as in the traceback below) — guard on existence or
# remove before a clean Restart & Run All.
y, sr = librosa.load('/tmp/sigh.wav', sr=48000)
y
---------------------------------------------------------------------------
LibsndfileError                           Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/librosa/core/audio.py:176, in load(path, sr, mono, offset, duration, dtype, res_type)
    175 try:
--> 176     y, sr_native = __soundfile_load(path, offset, duration, dtype)
    178 except sf.SoundFileRuntimeError as exc:
    179     # If soundfile failed, try audioread instead

File /opt/conda/lib/python3.10/site-packages/librosa/core/audio.py:209, in __soundfile_load(path, offset, duration, dtype)
    207 else:
    208     # Otherwise, create the soundfile object
--> 209     context = sf.SoundFile(path)
    211 with context as sf_desc:

File /opt/conda/lib/python3.10/site-packages/soundfile.py:658, in SoundFile.__init__(self, file, mode, samplerate, channels, subtype, endian, format, closefd)
    656 self._info = _create_info_struct(file, mode, samplerate, channels,
    657                                  format, subtype, endian)
--> 658 self._file = self._open(file, mode_int, closefd)
    659 if set(mode).issuperset('r+') and self.seekable():
    660     # Move write position to 0 (like in Python file objects)

File /opt/conda/lib/python3.10/site-packages/soundfile.py:1216, in SoundFile._open(self, file, mode_int, closefd)
   1215     err = _snd.sf_error(file_ptr)
-> 1216     raise LibsndfileError(err, prefix="Error opening {0!r}: ".format(self.name))
   1217 if mode_int == _snd.SFM_WRITE:
   1218     # Due to a bug in libsndfile version <= 1.0.25, frames != 0
   1219     # when opening a named pipe in SFM_WRITE mode.
   1220     # See http://github.com/erikd/libsndfile/issues/77.

LibsndfileError: Error opening '/tmp/sigh.wav': System error.

During handling of the above exception, another exception occurred:

FileNotFoundError                         Traceback (most recent call last)
Cell In[9], line 1
----> 1 y, sr = librosa.load('/tmp/sigh.wav', sr=48000)
      2 y

File /opt/conda/lib/python3.10/site-packages/librosa/core/audio.py:184, in load(path, sr, mono, offset, duration, dtype, res_type)
    180 if isinstance(path, (str, pathlib.PurePath)):
    181     warnings.warn(
    182         "PySoundFile failed. Trying audioread instead.", stacklevel=2
    183     )
--> 184     y, sr_native = __audioread_load(path, offset, duration, dtype)
    185 else:
    186     raise exc

File /opt/conda/lib/python3.10/site-packages/decorator.py:232, in decorate.<locals>.fun(*args, **kw)
    230 if not kwsyntax:
    231     args, kw = fix(args, kw, sig)
--> 232 return caller(func, *(extras + args), **kw)

File /opt/conda/lib/python3.10/site-packages/librosa/util/decorators.py:60, in deprecated.<locals>.__wrapper(func, *args, **kwargs)
     51 """Warn the user, and then proceed."""
     52 warnings.warn(
     53     "{:s}.{:s}\n\tDeprecated as of librosa version {:s}."
     54     "\n\tIt will be removed in librosa version {:s}.".format(
   (...)
     58     stacklevel=3,  # Would be 2, but the decorator adds a level
     59 )
---> 60 return func(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/librosa/core/audio.py:241, in __audioread_load(path, offset, duration, dtype)
    238     reader = path
    239 else:
    240     # If the input was not an audioread object, try to open it
--> 241     reader = audioread.audio_open(path)
    243 with reader as input_file:
    244     sr_native = input_file.samplerate

File /opt/conda/lib/python3.10/site-packages/audioread/__init__.py:127, in audio_open(path, backends)
    125 for BackendClass in backends:
    126     try:
--> 127         return BackendClass(path)
    128     except DecodeError:
    129         pass

File /opt/conda/lib/python3.10/site-packages/audioread/rawread.py:59, in RawAudioFile.__init__(self, filename)
     58 def __init__(self, filename):
---> 59     self._fh = open(filename, 'rb')
     61     try:
     62         self._file = aifc.open(self._fh)

FileNotFoundError: [Errno 2] No such file or directory: '/tmp/sigh.wav'
In [ ]:
# Echo the sample rate from the scratch load above (only defined if it succeeded).
sr
In [ ]:
# Find runs of exact-zero samples in the loaded audio — presumably the silent
# gaps between metronome beeps (assumes a digitally-silent capture; TODO confirm).
zr = find_zero_runs(y)
zr
In [ ]:
# Derive tempo from the silence runs between metronome beeps: the span from one
# run's end to the next run's end is presumably one beat period — TODO confirm.
zero_runs = list(map(list, zr))
bpm = beep_len = None  # stay defined (and visibly N/A) when too few runs were found
if len(zero_runs) >= 4:
    bpm = 60 / ((zero_runs[2][1] - zero_runs[1][1]) / sr)
    # beep length: from the end of one silence run to the start of the next
    beep_len = ((zero_runs[2][0] - zero_runs[1][1] + 1) / sr)
# Only the last expression of a cell renders — the original bare `bpm` line was
# a no-op, so show both values together.
bpm, beep_len
In [ ]: